Fecha de creación: 06/09/2022
Grupo: fraud detection (2)
Integrantes:
Resumen
El objetivo del presente notebook es probar diferentes alternativas para detección de outliers (fraudes) en el dataset de transacciones bancarias.
import json
import os
import warnings
import time
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn_pandas import DataFrameMapper
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)
# Reading data
# Paths resolve relative to the notebook's parent directory: <root>/data/raw.
ROOT_PATH = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(ROOT_PATH, 'data', 'raw')
# Load the two halves of the transactions dataset and stack them.
df1 = pd.read_csv(os.path.join(DATA_PATH, 'fraud_data_pt1.csv'), index_col=0)
# NOTE(review): 'fraud_data_pt12.csv' looks like a typo for
# 'fraud_data_pt2.csv' — confirm the actual filename on disk.
df2 = pd.read_csv(os.path.join(DATA_PATH, 'fraud_data_pt12.csv'), index_col=0)
df = pd.concat([df1, df2], ignore_index=True)
# Drop the intermediate frames to keep peak memory down.
del df1, df2
# Split off the binary label (Class: 1 = fraud, 0 = no fraud; see fraud_map below).
target = df[['Class']]
df = df.drop('Class', axis=1)
Vamos a aplicar StandardScaler sobre las features, para luego tratar de reducir la dimensionalidad con PCA a un espacio que se pueda graficar.
# Standardize every feature (zero mean, unit variance), keeping the result
# as a DataFrame with the original index and column names.
scaler = StandardScaler()
mapper = DataFrameMapper([(df.columns, scaler)])
scaled_features_df = pd.DataFrame(
    mapper.fit_transform(df.copy()),
    columns=df.columns,
    index=df.index,
)
# Cumulative explained-variance ratio per number of PCA components.
# A single full-rank PCA fit is enough: the cumulative sum of its
# explained_variance_ratio_ equals the total ratio an n-component fit would
# report, so we avoid refitting PCA once per candidate n (k fits -> 1 fit).
pca = PCA(n_components=df.shape[1], random_state=42)
pca.fit(scaled_features_df)
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
# ratios: {n_components: total explained variance ratio}
ratios = {n: total for n, total in enumerate(cumulative_ratio, start=1)}
# Line plot: cumulative explained variance vs. number of PCA components.
df_exp_ratios = pd.DataFrame(
    list(ratios.items()),
    columns=['n_components', 'sum_explained_ratio'],
)
fig = px.line(
    df_exp_ratios,
    x='n_components',
    y='sum_explained_ratio',
    markers=True,
    title='N explained variance ratio (PCA)',
)
fig.show()
En otras palabras, la información presente en el conjunto de datos es difícil de proyectar en un espacio de menor dimensión sin perder varianza explicada.
Si 2 o 3 componentes concentraran una varianza explicada aceptable, podríamos usarlos para graficar el agrupamiento.
Funciones genéricas de utilidad diversa.
def process_metrics(y_true, y_pred):
    """Compute metrics based on model predictions.

    Args:
        - y_true: ground truth.
        - y_pred: prediction (score).

    Returns:
        - metrics: dictionary with all computed metrics.
    """
    scorers = {
        'precision_score': precision_score,
        'recall_score': recall_score,
        'f1_score': f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}
def print_as_json(dic):
    """Pretty-print a dictionary as indented JSON."""
    formatted = json.dumps(dic, indent=4)
    print(formatted)
def plot_confusion_matrix(y_true, y_pred):
    """Plot a confusion matrix for the fraud predictions.

    Args:
        - y_true: ground truth labels (0 = no fraud, 1 = fraud).
        - y_pred: model predictions with the same encoding.
    """
    # Single axes; the unused Figure handle is discarded.
    _, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6))
    cm = confusion_matrix(y_true, y_pred)
    cm_display = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=['No fraude', 'Fraude']
    )
    # values_format='g' prints raw counts instead of scientific notation.
    cm_display.plot(cmap=plt.cm.Blues, values_format='g',
                    colorbar=False, ax=ax)
    ax.title.set_text('Confusion matrix')
# Reattach the label to the scaled frame, then split into features / target.
scaled_features_df['Class'] = target
y = scaled_features_df[['Class']]
X = scaled_features_df.drop(columns='Class')
%%time
# Vanilla model
# LOF with default-ish settings; n_jobs=-1 uses all cores.
# NOTE(review): novelty=True is meant for scoring NEW data — sklearn's docs
# discourage calling predict on the training set afterwards; confirm intent.
lof_model = LocalOutlierFactor(n_neighbors=20, novelty=True, n_jobs=-1)
lof_model.fit(X)
CPU times: user 13min 9s, sys: 4.24 s, total: 13min 13s Wall time: 1min 51s
LocalOutlierFactor(n_jobs=-1, novelty=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LocalOutlierFactor(n_jobs=-1, novelty=True)
%%time
# Predict the labels (1 inlier, -1 outlier) of X according to LOF.
# NOTE(review): X is the same data LOF was fitted on; with novelty=True
# these scores may be unreliable per sklearn's documentation — confirm.
lof_preds = lof_model.predict(X)
CPU times: user 13min 17s, sys: 5.11 s, total: 13min 22s Wall time: 1min 38s
# Map LOF's convention onto the target encoding: outlier (-1) -> fraud (1).
lof_preds = np.where(lof_preds == -1, 1, 0)
# Score the mapped predictions against the ground truth.
lof_metrics = process_metrics(y, lof_preds)
print('Metricas obtenidas:')
print_as_json(lof_metrics)
Metricas obtenidas:
{
"precision_score": 0.004128183599216345,
"recall_score": 0.11991869918699187,
"f1_score": 0.00798160173160173
}
# Visualize where LOF's errors concentrate (false positives vs. misses).
plot_confusion_matrix(y, lof_preds)
El modelo es bastante malo separando los fraudes del resto de las transacciones.
%%time
# Vanilla model
# 300 trees; random_state=42 pins the subsampling for reproducibility,
# n_jobs=-1 parallelizes tree building across all cores.
isf_model = IsolationForest(n_estimators=300, random_state=42, n_jobs=-1)
isf_model.fit(X)
CPU times: user 4.7 s, sys: 1.27 s, total: 5.97 s Wall time: 1.24 s
IsolationForest(n_estimators=300, n_jobs=-1, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
IsolationForest(n_estimators=300, n_jobs=-1, random_state=42)
# predict assigns 1 (normal point) or -1 (anomaly) to each sample in X,
# following the same convention as LOF.
isf_preds = isf_model.predict(X)
# Align with the target encoding: anomaly (-1) -> fraud (1).
isf_preds = np.where(isf_preds == -1, 1, 0)
# Score the mapped predictions against the ground truth.
isf_metrics = process_metrics(y, isf_preds)
print('Metricas obtenidas:')
print_as_json(isf_metrics)
Metricas obtenidas:
{
"precision_score": 0.03893203883495146,
"recall_score": 0.8150406504065041,
"f1_score": 0.07431430689399557
}
# Visualize Isolation Forest's errors for comparison against LOF.
plot_confusion_matrix(y, isf_preds)
Isolation Forest, si bien tiene sus limitaciones, es capaz de separar de una mejor forma los fraudes del resto de transacciones. Pero a un costo alto de falsos positivos (menor que el de LOF).
Se aplicará PCA para crear 3 componentes y visualizar inliners y outliers sobre un subsample del dataset.
# Project the scaled features onto 3 principal components for plotting.
pca = PCA(n_components=3)
components = pca.fit_transform(X)
reduced_df = pd.DataFrame(
    components,
    columns=['component_1', 'component_2', 'component_3'],
)
reduced_df['Target'] = target
# Home-made undersampling: keep every fraud, plus a random subsample of
# normal transactions, so the 3-D scatter plot stays readable.
n_samples = 5000
frauds = reduced_df[reduced_df.Target == 1].copy()
no_frauds = reduced_df[reduced_df.Target == 0].copy()
# random_state pins the subsample so the plot is reproducible, consistent
# with the random_state=42 used elsewhere in this notebook.
no_frauds = no_frauds.sample(n=n_samples, random_state=42)
df_to_plot = pd.concat([frauds, no_frauds], axis=0)
# Human-readable class labels for the plot legend.
fraud_map = {
    0: 'No fraude',
    1: 'Fraude'
}
df_to_plot['Target'] = df_to_plot['Target'].map(fraud_map)
# 3-D scatter of the PCA components, colored by class label.
# NOTE(review): the title mentions Isolation Forest, but the coloring uses
# the ground-truth Target, not the model's predictions — confirm intent.
fig = px.scatter_3d(df_to_plot, x='component_1', y='component_2',
z='component_3', color='Target', width=1000, height=600,
title='Anomaly detection mediante Isolation Forest')
fig.show()
Como era de esperar, debido a la poca varianza explicada contenida en 3 dimensiones, no se observa una clara separación de las clases.